In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline
from warnings import filterwarnings 
filterwarnings("ignore")
D:\anaconda files\lib\site-packages\scipy\__init__.py:155: UserWarning: A NumPy version >=1.18.5 and <1.25.0 is required for this version of SciPy (detected version 1.26.4
  warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
In [2]:
data=pd.read_csv("C:\\Users\\laxma\\Downloads\\diabetes.csv")
data
Out[2]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
... ... ... ... ... ... ... ... ... ...
763 10 101 76 48 180 32.9 0.171 63 0
764 2 122 70 27 0 36.8 0.340 27 0
765 5 121 72 23 112 26.2 0.245 30 0
766 1 126 60 0 0 30.1 0.349 47 1
767 1 93 70 31 0 30.4 0.315 23 0

768 rows × 9 columns

In [3]:
data.head()
Out[3]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
In [4]:
data.tail()
Out[4]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
763 10 101 76 48 180 32.9 0.171 63 0
764 2 122 70 27 0 36.8 0.340 27 0
765 5 121 72 23 112 26.2 0.245 30 0
766 1 126 60 0 0 30.1 0.349 47 1
767 1 93 70 31 0 30.4 0.315 23 0
In [5]:
data.describe()
Out[5]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000
mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578 0.471876 33.240885 0.348958
std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160 0.331329 11.760232 0.476951
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.078000 21.000000 0.000000
25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000 0.243750 24.000000 0.000000
50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000 0.372500 29.000000 0.000000
75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000 0.626250 41.000000 1.000000
max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 2.420000 81.000000 1.000000
In [6]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
In [7]:
data.isnull().sum()
Out[7]:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
In [8]:
data.duplicated().sum()
Out[8]:
0
In [9]:
data.columns
Out[9]:
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')
In [10]:
data.shape
Out[10]:
(768, 9)
In [11]:
#VISUALIZATION
In [12]:
plt.bar(data['Pregnancies'],data['Age'])
plt.xticks(rotation=90)
plt.show()
In [13]:
fig=px.bar(data,x='Glucose',y='Age',color='Glucose')
fig.show()
In [14]:
fig=px.violin(data,x='Pregnancies',y='Age',color='Pregnancies')
fig.show()
In [15]:
plt.figure(figsize=(10,4))
sns.countplot(x='SkinThickness', data=data, color='black')
plt.title('SkinThickness and count')
plt.xticks(rotation=90)
plt.show()
In [16]:
sns.lineplot(x='Glucose', y='BMI', data=data).set_title('Variation of Glucose with BMI')
Out[16]:
Text(0.5, 1.0, 'Variation of Glucose with BMI')
In [17]:
sns.barplot(data['Outcome'],data['DiabetesPedigreeFunction'],color='r')
plt.xticks(rotation=90)
plt.show()
In [18]:
plt.figure(figsize=(8, 4))
sns.scatterplot(data=data, x='BloodPressure', y='Age')
plt.title('BloodPressure and there Age')
plt.xlabel('BloodPressure')
plt.ylabel('Age')
plt.show()
In [19]:
sns.displot(data["DiabetesPedigreeFunction"])
Out[19]:
<seaborn.axisgrid.FacetGrid at 0x14b63e7d640>
In [20]:
sns.relplot(x='SkinThickness',y='Age',data=data)
Out[20]:
<seaborn.axisgrid.FacetGrid at 0x14b63e76d00>
In [21]:
sns.boxplot(x='Outcome',y='Age',data=data)
Out[21]:
<AxesSubplot:xlabel='Outcome', ylabel='Age'>
In [22]:
sns.violinplot(x='Outcome',y='BloodPressure',data=data)
Out[22]:
<AxesSubplot:xlabel='Outcome', ylabel='BloodPressure'>
In [23]:
sns.countplot(data=data, x="Pregnancies", color="yellowgreen")
Out[23]:
<AxesSubplot:xlabel='Pregnancies', ylabel='count'>
In [24]:
sns.histplot(data, x="Pregnancies", hue="Outcome", multiple="stack",bins = 50, kde=True)
Out[24]:
<AxesSubplot:xlabel='Pregnancies', ylabel='Count'>
In [25]:
#MODEL BUILDING
In [26]:
X = data.drop(['Outcome'], axis=1)
y = data['Outcome']
In [27]:
X.head()
Out[27]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age
0 6 148 72 35 0 33.6 0.627 50
1 1 85 66 29 0 26.6 0.351 31
2 8 183 64 0 0 23.3 0.672 32
3 1 89 66 23 94 28.1 0.167 21
4 0 137 40 35 168 43.1 2.288 33
In [28]:
y.head()
Out[28]:
0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64
In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
In [30]:
X_train.shape, X_test.shape
Out[30]:
((514, 8), (254, 8))
In [31]:
X_train.dtypes
Out[31]:
Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
dtype: object
In [32]:
from sklearn.tree import DecisionTreeClassifier
DTree = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=0)
DTree.fit(X_train, y_train)
Out[32]:
DecisionTreeClassifier(max_depth=3, random_state=0)
In [33]:
y_pred = DTree.predict(X_test)
y_pred
Out[33]:
array([1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0], dtype=int64)
In [34]:
from sklearn.metrics import accuracy_score
print('model accuracy score with criterion gini index: {0:04f}'. format (accuracy_score(y_test, y_pred)))
model accuracy score with criterion gini index: 0.692913
In [35]:
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix
Out[35]:
array([[117,  51],
       [ 27,  59]], dtype=int64)
In [36]:
plt.figure(figsize=(8,6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='crest', cbar=False)
Out[36]:
<AxesSubplot:>
In [37]:
from sklearn.metrics import classification_report
class_report = classification_report(y_test, y_pred)
print(class_report)
              precision    recall  f1-score   support

           0       0.81      0.70      0.75       168
           1       0.54      0.69      0.60        86

    accuracy                           0.69       254
   macro avg       0.67      0.69      0.68       254
weighted avg       0.72      0.69      0.70       254

In [38]:
plt.figure(figsize=(12,8))
from sklearn import tree
tree.plot_tree(DTree.fit(X_train, y_train))
Out[38]:
[Text(0.5, 0.875, 'X[1] <= 154.5\ngini = 0.457\nsamples = 514\nvalue = [332, 182]'),
 Text(0.25, 0.625, 'X[7] <= 30.5\ngini = 0.382\nsamples = 432\nvalue = [321, 111]'),
 Text(0.125, 0.375, 'X[1] <= 127.5\ngini = 0.24\nsamples = 251\nvalue = [216, 35]'),
 Text(0.0625, 0.125, 'gini = 0.153\nsamples = 203\nvalue = [186, 17]'),
 Text(0.1875, 0.125, 'gini = 0.469\nsamples = 48\nvalue = [30, 18]'),
 Text(0.375, 0.375, 'X[5] <= 26.95\ngini = 0.487\nsamples = 181\nvalue = [105, 76]'),
 Text(0.3125, 0.125, 'gini = 0.105\nsamples = 36\nvalue = [34, 2]'),
 Text(0.4375, 0.125, 'gini = 0.5\nsamples = 145\nvalue = [71, 74]'),
 Text(0.75, 0.625, 'X[5] <= 28.7\ngini = 0.232\nsamples = 82\nvalue = [11, 71]'),
 Text(0.625, 0.375, 'X[5] <= 25.35\ngini = 0.486\nsamples = 12\nvalue = [5, 7]'),
 Text(0.5625, 0.125, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]'),
 Text(0.6875, 0.125, 'gini = 0.469\nsamples = 8\nvalue = [5, 3]'),
 Text(0.875, 0.375, 'X[6] <= 1.428\ngini = 0.157\nsamples = 70\nvalue = [6, 64]'),
 Text(0.8125, 0.125, 'gini = 0.116\nsamples = 65\nvalue = [4, 61]'),
 Text(0.9375, 0.125, 'gini = 0.48\nsamples = 5\nvalue = [2, 3]')]
In [ ]: